# -*- coding:utf-8 -*-
import json
import os

class PoetryToWord:
    """Extract the distinct Chinese characters used in a folder of poetry JSON files.

    Every regular file directly inside the poetry directory is loaded as JSON
    and iterated entry by entry. Each entry's ``"content"`` value is scanned
    item by item (character by character when it is a string), and every item
    accepted by :meth:`isChinese` is collected. The sorted, de-duplicated
    result is written one item per line to the output file.
    """

    # Defaults keep the locations the script has always used.
    DEFAULT_POETRY_DIR = "./poetry/"
    DEFAULT_OUTPUT_PATH = "./word/words.txt"

    @staticmethod
    def loadWord(poetryDir: str = DEFAULT_POETRY_DIR,
                 outputPath: str = DEFAULT_OUTPUT_PATH) -> None:
        """Run the whole pipeline: list files, collect characters, write output.

        Args:
            poetryDir: Directory holding the JSON poetry files.
            outputPath: File that receives the sorted characters.
        """
        # A set de-duplicates characters across all files.
        wordSet = set()
        for f in PoetryToWord.fileList(poetryDir):
            PoetryToWord.parseFile(wordSet, f)

        # Sort and persist the collected characters.
        PoetryToWord.writeWordTofile(wordSet, outputPath)

    @staticmethod
    def fileList(fileDir: str = DEFAULT_POETRY_DIR) -> list:
        """Return the paths of the regular files directly inside ``fileDir``.

        Sub-directories are skipped; the listing is not recursive.

        Args:
            fileDir: Directory to list.

        Returns:
            A list of paths, each being ``fileDir`` joined with an entry name.
        """
        candidates = (os.path.join(fileDir, name) for name in os.listdir(fileDir))
        fileList = [path for path in candidates if os.path.isfile(path)]
        print("============================= 1、fileList: ", fileList)
        return fileList

    @staticmethod
    def parseFile(wordSet: set, file: str) -> None:
        """Add every Chinese character found in ``file`` to ``wordSet``.

        The file is read as UTF-8 JSON and iterated entry by entry. Entries
        whose ``"content"`` is missing, ``None`` or empty are skipped.

        Args:
            wordSet: Set updated in place with the characters found.
            file: Path of the JSON file to parse.
        """
        print("============================= 2、parseFile: ", file)
        with open(file, 'r', encoding='utf-8') as jsonFile:
            jsonList = json.load(jsonFile)

        for j in jsonList:
            # .get() so an entry without a "content" key is skipped instead of
            # aborting the whole run with a KeyError.
            content = j.get("content")
            if not content:
                continue
            wordSet.update(c for c in content if PoetryToWord.isChinese(c))

    @staticmethod
    def isChinese(ch) -> bool:
        """Return True when ``ch`` lies in the range U+4E00..U+9FFF (inclusive).

        Characters outside that range (ASCII, punctuation, code points in
        other Unicode blocks) are rejected.
        """
        return '\u4e00' <= ch <= '\u9fff'

    @staticmethod
    def writeWordTofile(wordSet: set, outputPath: str = DEFAULT_OUTPUT_PATH) -> None:
        """Write the items of ``wordSet``, sorted, one per line to ``outputPath``.

        The parent directory is created when it does not exist, and the file is
        written as UTF-8 (overwriting any previous content).

        Args:
            wordSet: Characters to write.
            outputPath: Destination file path.
        """
        # Default string ordering, i.e. by code point.
        wordList = sorted(wordSet)

        print(len(wordList))
        print(wordList)

        # Make sure the destination directory exists so open() cannot fail on
        # a fresh checkout; an empty dirname means the current directory.
        outputDir = os.path.dirname(outputPath)
        if outputDir:
            os.makedirs(outputDir, exist_ok=True)

        # The context manager guarantees the data is flushed and the handle is
        # closed even if a write raises.
        with open(outputPath, 'w', encoding='utf-8') as writer:
            writer.writelines(w + "\n" for w in wordList)


# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    PoetryToWord.loadWord()